Overview of Activity3

  • Predict/classify whether a player is a supporter
  • Predict a player's rank
library(rpart.plot)
## Loading required package: rpart

Data

# Read the player table from tldata.csv, then preview the first rows and the
# per-column summary statistics.
data <- read.csv(file = "tldata.csv")
head(data)
summary(data)
##        X            Standing       Username           Country         
##  Min.   :    1   Min.   :    1   Length:39769       Length:39769      
##  1st Qu.: 9943   1st Qu.: 9943   Class :character   Class :character  
##  Median :19885   Median :19885   Mode  :character   Mode  :character  
##  Mean   :19885   Mean   :19885                                        
##  3rd Qu.:29827   3rd Qu.:29827                                        
##  Max.   :39769   Max.   :39769                                        
##       Wins         Games.Played       Winrate            APM        
##  Min.   :   0.0   Min.   :  10.0   Min.   :0.0000   Min.   :  1.05  
##  1st Qu.:  31.0   1st Qu.:  63.0   1st Qu.:0.4844   1st Qu.: 15.11  
##  Median :  83.0   Median : 159.0   Median :0.5087   Median : 23.10  
##  Mean   : 158.7   Mean   : 311.2   Mean   :0.4951   Mean   : 30.56  
##  3rd Qu.: 202.0   3rd Qu.: 389.0   3rd Qu.:0.5327   3rd Qu.: 38.13  
##  Max.   :4001.0   Max.   :8142.0   Max.   :1.0000   Max.   :227.68  
##       PPS              VS         Glicko.Rating  Rating.Deviation
##  Min.   :0.300   Min.   :  1.75   Min.   : 265   Min.   : 60.00  
##  1st Qu.:0.940   1st Qu.: 32.44   1st Qu.:1168   1st Qu.: 62.00  
##  Median :1.170   Median : 49.86   Median :1479   Median : 72.00  
##  Mean   :1.259   Mean   : 64.76   Mean   :1496   Mean   : 74.94  
##  3rd Qu.:1.480   3rd Qu.: 81.44   3rd Qu.:1774   3rd Qu.: 86.00  
##  Max.   :4.270   Max.   :438.21   Max.   :4276   Max.   :100.00  
##   Tetra.Rating          Rank           Active.This.Week   Supporter.Status. 
##  Min.   :   11.47   Length:39769       Length:39769       Length:39769      
##  1st Qu.: 4531.12   Class :character   Class :character   Class :character  
##  Median : 9509.57   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 9862.82                                                           
##  3rd Qu.:14693.09                                                           
##  Max.   :24752.28                                                           
##   RankColour       
##  Length:39769      
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
# Show every column's type together with its first few values.
str(data)
## 'data.frame':    39769 obs. of  17 variables:
##  $ X                : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Standing         : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Username         : chr  "5HAN" "CABOOZLED_PIE" "TURTLE" "SYAKEGOHAN" ...
##  $ Country          : chr  "Japan" "United States" "Korea, Republic of" "Japan" ...
##  $ Wins             : int  1026 347 511 358 320 270 179 323 164 356 ...
##  $ Games.Played     : int  1233 394 670 437 454 358 229 463 206 533 ...
##  $ Winrate          : num  0.832 0.881 0.763 0.819 0.705 ...
##  $ APM              : num  228 213 195 203 191 ...
##  $ PPS              : num  4.27 3.92 3.39 3.83 3.42 3.84 3.44 3.56 3.04 3.84 ...
##  $ VS               : num  438 421 389 394 391 ...
##  $ Glicko.Rating    : int  4276 4026 3963 3944 3931 3899 3862 3854 3853 3767 ...
##  $ Rating.Deviation : int  85 71 72 76 72 81 67 65 86 68 ...
##  $ Tetra.Rating     : num  24752 24641 24601 24591 24579 ...
##  $ Rank             : chr  "X+" "X+" "X+" "X+" ...
##  $ Active.This.Week : chr  "Yes" "Yes" "Yes" "No" ...
##  $ Supporter.Status.: chr  "Yes" "Yes" "No" "Yes" ...
##  $ RankColour       : chr  "#A763EA" "#A763EA" "#A763EA" "#A763EA" ...
## Preparing the data

# Convert the raw columns to the types used by the models.

# Country: rewrite the comma-inverted names into natural order, then store the
# column as a factor.
country_renames <- c(
  "Korea, Republic of" = "Republic of Korea",
  "Venezuela, Bolivarian Republic of" = "Republic of Venezuela",
  "Macedonia, the former Yugoslav Republic of" = "Republic of Macedonia"
)
country <- as.character(data$Country)
for (old_name in names(country_renames)) {
  country <- sub(old_name, country_renames[[old_name]], country)
}
data$Country <- as.factor(country)

# Rank: factor whose levels are listed from "D" up to "X+".
rank_order <- c(
  "D", "D+", "C-", "C", "C+", "B-", "B", "B+", "A-",
  "A", "A+", "S-", "S", "S+", "SS", "U", "X", "X+"
)
data$Rank <- factor(data$Rank, levels = rank_order)

# "Yes"/"No" columns become numeric 1/0 indicators.
for (flag_col in c("Active.This.Week", "Supporter.Status.")) {
  data[[flag_col]] <- ifelse(data[[flag_col]] == "Yes", 1, 0)
}

# Counts are stored as doubles rather than integers.
for (count_col in c("Wins", "Games.Played")) {
  data[[count_col]] <- as.numeric(data[[count_col]])
}

data$Username <- as.character(data$Username)

# X is only a row index; Standing carries the same information.
data$X <- NULL
str(data)
## 'data.frame':    39769 obs. of  16 variables:
##  $ Standing         : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Username         : chr  "5HAN" "CABOOZLED_PIE" "TURTLE" "SYAKEGOHAN" ...
##  $ Country          : Factor w/ 225 levels "","Afghanistan",..: 103 212 162 103 157 145 162 212 89 95 ...
##  $ Wins             : num  1026 347 511 358 320 ...
##  $ Games.Played     : num  1233 394 670 437 454 ...
##  $ Winrate          : num  0.832 0.881 0.763 0.819 0.705 ...
##  $ APM              : num  228 213 195 203 191 ...
##  $ PPS              : num  4.27 3.92 3.39 3.83 3.42 3.84 3.44 3.56 3.04 3.84 ...
##  $ VS               : num  438 421 389 394 391 ...
##  $ Glicko.Rating    : int  4276 4026 3963 3944 3931 3899 3862 3854 3853 3767 ...
##  $ Rating.Deviation : int  85 71 72 76 72 81 67 65 86 68 ...
##  $ Tetra.Rating     : num  24752 24641 24601 24591 24579 ...
##  $ Rank             : Factor w/ 18 levels "D","D+","C-",..: 18 18 18 18 18 18 18 18 18 18 ...
##  $ Active.This.Week : num  1 1 1 0 1 1 1 1 1 1 ...
##  $ Supporter.Status.: num  1 1 0 1 1 1 0 1 1 0 ...
##  $ RankColour       : chr  "#A763EA" "#A763EA" "#A763EA" "#A763EA" ...
# Fix the RNG seed so the random fold assignment below is reproducible.
set.seed(4)

Splitting into test and train sets

# Randomly assign every row of data to one of num_of_splits folds.
num_of_splits <- 10
n_rows <- nrow(data)
# Repeat the fold ids often enough to cover all rows, then draw one id per row
# without replacement so the folds come out (almost) equally sized.
fold_pool <- rep(seq_len(num_of_splits), times = ceiling(n_rows / num_of_splits))
splits <- sample(fold_pool, size = n_rows)
# Fold sizes should be close to uniform.
summary(as.factor(splits))
##    1    2    3    4    5    6    7    8    9   10 
## 3977 3977 3977 3977 3977 3977 3976 3977 3977 3977
# Show the type and length of the fold-assignment vector.
str(splits)
##  int [1:39769] 7 9 8 4 4 6 3 9 10 10 ...
# Fold 1 is held out as the test set; all remaining folds form the training set.
in_test_fold <- splits == 1
train <- data[!in_test_fold, ]
test <- data[in_test_fold, ]

#nrow(train) + nrow(test)
#nrow(test)

#dynamically
#testdata(7)
#=> train
#=>test
# Min-max scaling: map each selected numeric column onto [0, 1] in a copy of
# data, leaving the original data frame untouched.
data_normalized <- data

# Column positions: Standing (1) and Wins through Tetra.Rating (4 to 12).
for (col_idx in c(1, 4:12)) {
  col_values <- data_normalized[, col_idx]
  col_range <- range(col_values)
  # scale() subtracts `center` and divides by `scale`; it returns a one-column
  # matrix, which is what ends up stored in the data frame.
  data_normalized[, col_idx] <- scale(
    col_values,
    center = col_range[1],
    scale = col_range[2] - col_range[1]
  )
}

str(data_normalized, give.attr = FALSE)
## 'data.frame':    39769 obs. of  16 variables:
##  $ Standing         : num [1:39769, 1] 0 0.0000251 0.0000503 0.0000754 0.0001006 ...
##  $ Username         : chr  "5HAN" "CABOOZLED_PIE" "TURTLE" "SYAKEGOHAN" ...
##  $ Country          : Factor w/ 225 levels "","Afghanistan",..: 103 212 162 103 157 145 162 212 89 95 ...
##  $ Wins             : num [1:39769, 1] 0.2564 0.0867 0.1277 0.0895 0.08 ...
##  $ Games.Played     : num [1:39769, 1] 0.1504 0.0472 0.0812 0.0525 0.0546 ...
##  $ Winrate          : num [1:39769, 1] 0.832 0.881 0.763 0.819 0.705 ...
##  $ APM              : num [1:39769, 1] 1 0.937 0.854 0.889 0.838 ...
##  $ PPS              : num [1:39769, 1] 1 0.912 0.778 0.889 0.786 ...
##  $ VS               : num [1:39769, 1] 1 0.96 0.887 0.898 0.892 ...
##  $ Glicko.Rating    : num [1:39769, 1] 1 0.938 0.922 0.917 0.914 ...
##  $ Rating.Deviation : num [1:39769, 1] 0.625 0.275 0.3 0.4 0.3 0.525 0.175 0.125 0.65 0.2 ...
##  $ Tetra.Rating     : num [1:39769, 1] 1 0.995 0.994 0.993 0.993 ...
##  $ Rank             : Factor w/ 18 levels "D","D+","C-",..: 18 18 18 18 18 18 18 18 18 18 ...
##  $ Active.This.Week : num  1 1 1 0 1 1 1 1 1 1 ...
##  $ Supporter.Status.: num  1 1 0 1 1 1 0 1 1 0 ...
##  $ RankColour       : chr  "#A763EA" "#A763EA" "#A763EA" "#A763EA" ...
#handling flags
# Turn the Rank factor into one numeric 0/1 indicator column per level.
# Columns are named "flag<Level>Rank", with "+" spelled "Plus" and "-" spelled
# "Minus" (e.g. "flagDPlusRank", "flagCMinusRank").
for (rank_level in levels(data_normalized$Rank)) {
  varname <- sprintf("flag%sRank", rank_level)
  varname <- sub("+", "Plus", varname, fixed = TRUE) # fixed = TRUE treats '+' literally
  varname <- sub("-", "Minus", varname, fixed = TRUE)
  # %in% gives FALSE (stored as 0) for a missing Rank instead of NA; assigning
  # by name also overwrites the column on a re-run rather than duplicating it.
  data_normalized[[varname]] <- as.numeric(data_normalized$Rank %in% rank_level)
}

# Remove the original Rank variable and one flag, so that rank "D" is the
# all-zeros reference level. Note the capital R: the generated column name is
# "flagDRank", and `$<-` needs the exact name to remove it.
data_normalized$Rank <- NULL
data_normalized$flagDRank <- NULL

str(data_normalized, give.attr = FALSE)
## 'data.frame':    39769 obs. of  33 variables:
##  $ Standing         : num [1:39769, 1] 0 0.0000251 0.0000503 0.0000754 0.0001006 ...
##  $ Username         : chr  "5HAN" "CABOOZLED_PIE" "TURTLE" "SYAKEGOHAN" ...
##  $ Country          : Factor w/ 225 levels "","Afghanistan",..: 103 212 162 103 157 145 162 212 89 95 ...
##  $ Wins             : num [1:39769, 1] 0.2564 0.0867 0.1277 0.0895 0.08 ...
##  $ Games.Played     : num [1:39769, 1] 0.1504 0.0472 0.0812 0.0525 0.0546 ...
##  $ Winrate          : num [1:39769, 1] 0.832 0.881 0.763 0.819 0.705 ...
##  $ APM              : num [1:39769, 1] 1 0.937 0.854 0.889 0.838 ...
##  $ PPS              : num [1:39769, 1] 1 0.912 0.778 0.889 0.786 ...
##  $ VS               : num [1:39769, 1] 1 0.96 0.887 0.898 0.892 ...
##  $ Glicko.Rating    : num [1:39769, 1] 1 0.938 0.922 0.917 0.914 ...
##  $ Rating.Deviation : num [1:39769, 1] 0.625 0.275 0.3 0.4 0.3 0.525 0.175 0.125 0.65 0.2 ...
##  $ Tetra.Rating     : num [1:39769, 1] 1 0.995 0.994 0.993 0.993 ...
##  $ Active.This.Week : num  1 1 1 0 1 1 1 1 1 1 ...
##  $ Supporter.Status.: num  1 1 0 1 1 1 0 1 1 0 ...
##  $ RankColour       : chr  "#A763EA" "#A763EA" "#A763EA" "#A763EA" ...
##  $ flagDRank        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ flagDPlusRank    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ flagCMinusRank   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ flagCRank        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ flagCPlusRank    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ flagBMinusRank   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ flagBRank        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ flagBPlusRank    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ flagAMinusRank   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ flagARank        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ flagAPlusRank    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ flagSMinusRank   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ flagSRank        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ flagSPlusRank    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ flagSSRank       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ flagURank        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ flagXRank        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ flagXPlusRank    : num  1 1 1 1 1 1 1 1 1 1 ...
#train
# Build a set with a higher proportion of supporters (50/50 split): keep every
# supporter and add an equally large random sample of non-supporters.
# NOTE(review): rows are drawn from `data`, which still contains the rows held
# out in `test` -- confirm whether this should sample from `train` instead.
allsupporters <- data[data$Supporter.Status. == 1, ]
allnonsupporters <- data[data$Supporter.Status. == 0, ]

allsupporters
allnonsupporters
train_balanced <- rbind(
  allsupporters,
  allnonsupporters[sample(seq_len(nrow(allnonsupporters)), nrow(allsupporters)), ]
)
# Shuffle the rows. The permutation must be drawn over train_balanced's own row
# count; using nrow(train) indexes rows that do not exist (yielding all-NA
# rows) or leaves rows out, depending on which frame is larger.
train_balanced <- train_balanced[sample(nrow(train_balanced)), ]
# Z-score standardization: add "<name>.s" columns holding (x - mean) / sd.
# Decision trees do not need standardized inputs, so this is only relevant
# for the other model types.
data_standardized <- data

for (col_name in c("Wins", "Games.Played", "Tetra.Rating")) {
  raw_values <- data[[col_name]]
  data_standardized[[paste0(col_name, ".s")]] <-
    (raw_values - mean(raw_values)) / sd(raw_values)
}

data_standardized

Regression

Linear Regression

Multiple Regression


KNN


Decision Tree

# Tree data
#data_without_support = data
#test_without_support = test
#train_without_support = train

#drop support
#data_without_support$Supporter.Status. = NULL
#test_without_support$Supporter.Status. = NULL
#train_without_support$Supporter.Status. = NULL

# Username and Country are removed from every frame handed to the trees.
tree_excluded_cols <- c("Username", "Country")

# Train: drop username, country
train_without_username <- train[, !(names(train) %in% tree_excluded_cols), drop = FALSE]

# Test: drop username, country
test_without_username <- test[, !(names(test) %in% tree_excluded_cols), drop = FALSE]

# Balanced train: drop username, country
train_balanced_without_username <-
  train_balanced[, !(names(train_balanced) %in% tree_excluded_cols), drop = FALSE]

#train_without_username$Standing = NULL
#train_without_username
train_balanced_without_username

CART

CART v1

set.seed(1)
# Classification tree for Supporter.Status. on the balanced training frame,
# using every remaining column as a predictor.
#cartfittrain = rpart(Supporter.Status.~., dat=train_without_username, method="class", control=rpart.control(minsplit=4, cp=0.0015))
cartfittrain <- rpart(
  Supporter.Status. ~ .,
  data = train_balanced_without_username,
  method = "class",
  control = rpart.control(minsplit = 4, cp = 0.003)
)
#cartfittrain = rpart(Supporter.Status.~ Glicko.Rating+Country, dat=train_without_username, method="class")
rpart.plot(cartfittrain, type = 2)

#control=rpart.control(cp=0.0005)
#min-splits

CART v2

train_without_username
# Classification tree fitted only on training rows with Standing below 400.
top_train_rows <- train_without_username[train_without_username$Standing < 400, ]
cartfittrain2 <- rpart(
  Supporter.Status. ~ .,
  data = top_train_rows,
  method = "class",
  control = rpart.control(minsplit = 4, cp = 0.03)
)

rpart.plot(cartfittrain2, type = 2)

str(train_without_username)
## 'data.frame':    35792 obs. of  14 variables:
##  $ Standing         : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Wins             : num  1026 347 511 358 320 ...
##  $ Games.Played     : num  1233 394 670 437 454 ...
##  $ Winrate          : num  0.832 0.881 0.763 0.819 0.705 ...
##  $ APM              : num  228 213 195 203 191 ...
##  $ PPS              : num  4.27 3.92 3.39 3.83 3.42 3.84 3.44 3.56 3.04 3.84 ...
##  $ VS               : num  438 421 389 394 391 ...
##  $ Glicko.Rating    : int  4276 4026 3963 3944 3931 3899 3862 3854 3853 3767 ...
##  $ Rating.Deviation : int  85 71 72 76 72 81 67 65 86 68 ...
##  $ Tetra.Rating     : num  24752 24641 24601 24591 24579 ...
##  $ Rank             : Factor w/ 18 levels "D","D+","C-",..: 18 18 18 18 18 18 18 18 18 18 ...
##  $ Active.This.Week : num  1 1 1 0 1 1 1 1 1 1 ...
##  $ Supporter.Status.: num  1 1 0 1 1 1 0 1 1 0 ...
##  $ RankColour       : chr  "#A763EA" "#A763EA" "#A763EA" "#A763EA" ...

CART v3

# Classification tree fitted only on test rows with Standing below 400.
# The row filter must be computed from test_without_username itself: a mask
# built from train_without_username has a different length and row order, so
# it would pick test rows by position rather than by their Standing.
top_test_rows <- test_without_username[test_without_username$Standing < 400, ]
cartfittest2 <- rpart(
  Supporter.Status. ~ .,
  data = top_test_rows,
  method = "class",
  control = rpart.control(minsplit = 4, cp = 0.03)
)

rpart.plot(cartfittest2, type = 2)

str(test_without_username)
## 'data.frame':    3977 obs. of  14 variables:
##  $ Standing         : int  12 24 35 38 96 97 98 102 131 142 ...
##  $ Wins             : num  152 263 124 736 181 695 680 98 783 183 ...
##  $ Games.Played     : num  205 397 166 1354 275 ...
##  $ Winrate          : num  0.742 0.662 0.747 0.544 0.658 ...
##  $ APM              : num  191 173 172 153 139 ...
##  $ PPS              : num  3.59 3.54 3.16 3.49 2.45 2.62 2.5 3.18 2.78 2.82 ...
##  $ VS               : num  374 352 334 310 298 ...
##  $ Glicko.Rating    : int  3722 3599 3523 3467 3231 3218 3218 3215 3103 3060 ...
##  $ Rating.Deviation : int  64 69 88 63 76 64 64 93 65 69 ...
##  $ Tetra.Rating     : num  24424 24297 24192 24146 23767 ...
##  $ Rank             : Factor w/ 18 levels "D","D+","C-",..: 18 18 18 18 17 17 17 17 17 17 ...
##  $ Active.This.Week : num  1 0 0 1 1 1 1 0 1 0 ...
##  $ Supporter.Status.: num  0 1 0 0 0 1 0 0 0 0 ...
##  $ RankColour       : chr  "#A763EA" "#A763EA" "#A763EA" "#A763EA" ...

CART v4


Neural Networks


Clustering